################################################################################
##
## Purpose: This script creates all figures and tables from SI section 8
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
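## Input Files:
##  - ./data/prepped/finalData.RData: Prepped data from 9_DATA_final_build.R
##  - ./data/prepped/hearings/topic_robustness.RData
##  - ./data/prepped/hearings/topic_models_100.RData
##  - ./data/prepped/hearings/stm_results.RData
##  - ./output/chatGPT_polite_BTM_aggression_cleaned.RData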
##
## Output Files:
##  - ./output/figures/SI_figure_12.pdf
##  - ./output/figures/SI_figure_13.pdf
##  - ./output/figures/SI_figure_14.pdf
##  - ./output/figures/SI_figure_15.pdf
##  - ./output/figures/SI_figure_16.pdf (saved manually from the RStudio Viewer; not written by this script)
##  - ./output/figures/SI_figure_17.pdf
##  - ./output/figures/SI_figure_18.pdf
##  - ./output/figures/SI_figure_19.pdf
##  - ./output/figures/SI_figure_20.pdf
##  - ./output/figures/SI_figure_21.pdf
##  - ./output/figures/SI_figure_22.pdf
##  - ./output/figures/SI_figure_23.pdf
##  - ./output/tables/SI_table_14.tex
##  - ./output/tables/SI_table_15.tex
##  - ./output/tables/SI_table_16.tex
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
# Start from an empty workspace so that objects left over from a previous
# script cannot leak into this one.
rm(list = ls())
gc()

# Attach packages with library() instead of require(): library() stops with an
# error when a package is missing, whereas require() only returns FALSE and
# lets the script fail later with a much less obvious message.
library(tidyverse)
library(ggridges)
library(visNetwork)
library(igraph)
library(BradleyTerry2)
library(fixest) # was attached twice in a row; once is sufficient
library(qvcalc)
library(ggrepel)
library(tidytext)
library(ranger)


# Fixed seed so that random draws made later in the script are repeatable.
set.seed(123)

# Compute details
# Print a short description of the machine used for this run, followed by the
# R session information, so the log file records the compute environment.
print(paste0('Compute environment from ', Sys.Date(), ' run by Bisbee'))
if (Sys.info()['sysname'] == 'Windows') {
  # Query RAM and CPU details through the Windows `wmic` command-line tool.
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov

  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if (Sys.info()['sysname'] == 'Linux') {
  # The comparison used to be against 'Linuxs', which can never equal the
  # system name, so this branch was unreachable.
  # NOTE(review): the fields are picked by fixed position after splitting on
  # single spaces, which depends on the exact column padding of `ps` — confirm
  # the indices on the target machine.
  splitted <- strsplit(system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE), " ")
  df <- do.call(rbind, lapply(splitted[-1],
                              function(x) data.frame(
                                cpu = as.numeric(x[2]),
                                mem = as.numeric(x[4]),
                                pid = as.numeric(x[5]),
                                cmd = paste(x[-c(1:5)], collapse = " "))))
  # Print explicitly so the table also reaches the log when the script is run
  # through source(), where values inside a block are not auto-printed.
  print(df)
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


# Loading data
load('./data/prepped/finalData.RData')


# SI Figure 12
# slopeFn: draw the tone scores of a few selected utterances as a slope chart.
#
# Arguments:
#   textGrepl  regular expression; rows of the global `utterance_level` whose
#              `textclean` matches are kept.
#   dropDims   regular expression; score columns whose names match are removed.
#   groupDim   regular expression; score columns whose names match are pooled
#              under this single label and averaged together.
#   wdth       width (in characters) passed to stringi::stri_wrap() when the
#              utterance text is wrapped for use as an axis label.
#   ords       optional integer vector that reorders the alphabetically sorted
#              utterance labels; NULL keeps the alphabetical order.
#
# Returns a ggplot object; nothing is written to disk here.
slopeFn <- function(textGrepl,dropDims = 'AUTHOR|comb|LIKELY|SEVERE|UNSUBSTANTIAL|INCOHERENT',groupDim = 'ATTACK',wdth = 20,ords = NULL) {
  toplot <- utterance_level %>%
    filter(grepl(textGrepl, textclean)) %>%
    select(text = textclean, matches('SENT')) %>%
    group_by(text) %>%
    # Wrap the text and keep at most ten lines. Indexing past the last wrapped
    # line yields NA, which paste() turns into the string "NA"; the gsub() then
    # cuts the label at the first "NA".
    # NOTE(review): `.` is the magrittr placeholder, which appears to be the
    # piped data frame here, so length(.) would be its column count rather
    # than the number of wrapped lines — confirm.
    # NOTE(review): the gsub() also truncates a label whose own text contains
    # "NA" — confirm none of the plotted utterances do.
    mutate(text = gsub('NA.*','',paste(stringi::stri_wrap(text,width = wdth)[1:min(length(.),10)],collapse = '\n'))) %>%
    select(-matches('lag$|error')) %>%
    gather(dim, sent, -text) %>%
    filter(!grepl(dropDims, dim)) %>%
    mutate(dim = ifelse(grepl(groupDim, dim), groupDim, dim)) %>%
    group_by(text, dim) %>%
    summarise(sent = mean(sent, na.rm = TRUE)) %>%
    ungroup() %>%
    # Turn column names such as SENT_FOO_BAR into the label "FOO BAR".
    mutate(dim = gsub('_', ' ', gsub('SENT_', '', dim)))

  # Default ordering: keep every label in alphabetical (factor-level) order.
  if (is.null(ords)) {
    ords <- seq_along(unique(toplot$text))
  }

  levs <- levels(factor(unique(toplot$text)))[ords]

  toplot <- toplot %>%
    mutate(text = factor(text, levels = levs))

  toplot %>%
    ggplot(aes(x = text, y = sent, color = dim, group = dim, label = dim)) +
    # `linewidth` is the line-thickness argument in current ggplot2 (already
    # used elsewhere in this script); `size` for lines is deprecated.
    geom_line(linewidth = 1.3) +
    # Dimension labels are drawn only next to the first and last utterance.
    geom_text_repel(data = toplot %>% filter(text == levs[1]), hjust = 1, nudge_x = -.05, direction = 'both') +
    geom_text_repel(data = toplot %>% filter(text == levs[length(levs)]), hjust = 0, nudge_x = .05, direction = 'both') +
    geom_point(size = 5) +
    scale_y_continuous(limits = c(0,1), name = 'Score (Pr label)', labels = scales::percent)

}

# Utterances to display, identified by distinctive opening phrases, and the
# score columns that are left out of this figure.
fig12Texts <- "But, Chair Yellen, forgive me for the interruption.|Okay. I will take that. We have seen|invitation, they would love to have you in Hot Springs|Yet, seniors haven't seen a"
fig12Dropped <- 'comb|OBSCENE|INCOHERENT|LIKELY|UNSUBSTANTIAL|SEXUALLY|SEVERE|COMMENTER|THREAT|TOXICITY|INSULT|INFLAMMATORY'

pdf(file = './output/figures/SI_figure_12.pdf', width = 9, height = 8)
# groupDim = 'none' matches no column name, so no dimensions are pooled.
slopeFn(textGrepl = fig12Texts,
        dropDims = fig12Dropped,
        groupDim = 'none',
        wdth = 70,
        ords = c(1, 4, 2, 3)) +
  labs(title = 'Tone Results',
       subtitle = 'High-Scoring Responses to Yellen',
       x = NULL) +
  theme_ridges() +
  theme(axis.text.y = element_text(size = 10, vjust = .5),
        legend.position = 'none') +
  coord_flip() +
  scale_colour_brewer(name = 'Dimension', palette = "Dark2")
dev.off()


# SI Section 8.2
# SI Figure 13
# Average every retained SENT_ score within hearing (docID), then plot the
# hearing-level means over time with a separate linear fit per period.
fig13Data <- utterance_level %>%
  select(docID, yellenTime, date, chamber, matches('SENT_'),
         -matches('comb|LIKELY|UNSUB|INCOH|_lag')) %>%
  group_by(docID, date, yellenTime, chamber) %>%
  mutate(n = n()) %>% # utterances per group; drives the point size below
  summarise(across(everything(), mean), .groups = 'drop') %>%
  # Three periods: before 2014, the rows flagged yellenTime, and the rest.
  mutate(yellenTime = ifelse(date < as.Date('2014-01-01'), 'Pre',
                             ifelse(yellenTime, 'Yellen', 'Post'))) %>%
  pivot_longer(cols = starts_with("SENT_")) %>%
  drop_na(value) %>%
  mutate(name = gsub('SENT', '', gsub('_', ' ', name)))

pdf(file = './output/figures/SI_figure_13.pdf', width = 8, height = 5)
ggplot(fig13Data, aes(x = date, y = value, group = yellenTime, size = n)) +
  geom_point(shape = 21) +
  geom_smooth(show.legend = FALSE, method = 'lm', se = FALSE, formula = 'y ~ poly(x,1)') +
  # Shade and label the 2014-2018 window.
  annotate(geom = 'rect', xmin = as.Date('2014-01-01'), xmax = as.Date('2018-01-01'),
           ymin = -Inf, ymax = Inf,
           alpha = .2, fill = 'grey50') +
  geom_vline(xintercept = as.Date(c('2014-01-01', '2018-01-01'))) +
  annotate(geom = 'text', x = as.Date('2016-01-01'), y = Inf, label = 'Yellen',
           vjust = 1) +
  theme_bw() +
  theme(axis.text = element_text(size = 7), legend.position = 'bottom') +
  facet_wrap(~name, scales = 'free') +
  labs(x = 'Date', y = 'Predicted proportion')
dev.off()


# SI Section 8.3
# SI Figure 14
# Mean coherence (average log ratio) for each number of topics k, ignoring
# infinite values; the vertical line marks k = 100.
load('./data/prepped/hearings/topic_robustness.RData')

coherenceByK <- cohres %>%
  as_tibble() %>%
  filter(!is.infinite(mean_logratio)) %>%
  group_by(k) %>%
  summarise(lr = mean(mean_logratio, na.rm = TRUE))

pdf(file = './output/figures/SI_figure_14.pdf', width = 7, height = 5)
ggplot(coherenceByK, aes(x = k, y = lr)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = 100) +
  labs(x = 'k',
       y = 'Coherence (avg. log ratio)')
dev.off()

# SI Figure 15
load('./data/prepped/hearings/topic_models_100.RData')

# Long table of the topic-word distribution: one row per (topic, word), where
# the topic id is the row position in the distribution matrix.
phiWide <- data.frame(lda_model$topic_word_distribution)
phiWide$topic <- seq_len(nrow(phiWide))
fullPhi <- as_tibble(gather(phiWide, word, phi, -topic))

# The 20 highest-phi words of every topic (left grouped by topic).
topWords <- slice_max(group_by(fullPhi, topic), phi, n = 20)

# Top words of topics 65 and 97; words that appear in both panels get their
# own fill colour.
fig15Data <- topWords %>%
  filter(topic %in% c(65,
                      97)) %>%
  group_by(word) %>%
  mutate(mult = ifelse(n() > 1, 'Multi', 'single')) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, phi, topic))

pdf(file = './output/figures/SI_figure_15.pdf', width = 7, height = 5)
ggplot(fig15Data, aes(x = phi, y = reorder(word, phi), fill = mult)) +
  geom_col() + ylab('') +
  scale_y_reordered() +
  facet_wrap(~topic, scales = 'free',
             labeller = labeller(topic = c('65' = 'Inflation Topic',
                                           '97' = 'Interest Rate Topic'))) +
  scale_fill_manual(guide = 'none', values = c('grey30', 'grey70'))
dev.off()


# SI Figure 16
# Network of LDA topics: two topics are linked when they share top words, and
# nodes are coloured by walktrap community.

# Document-topic weights merged with utterance metadata; the hearing date is
# recovered from the docID string.
toAnal <- doc_topic_distr %>%
  left_join(utterance_level %>%
              select(fullInd,textclean,text,speaker,position,gender,party,chamber,nominate_dim1,age,docID),
            by = c('id' = 'fullInd')) %>%
  mutate(date = as.Date(gsub('fed|\\.txt','',docID)))

# Topic totals by date, position, chamber and party (not used by the network
# drawn below).
toplot <- toAnal %>%
  group_by(topic,date,position,chamber,party) %>%
  summarise(nUtterances = n(),
            theta = sum(theta),.groups = 'drop')

# For every topic k ("ego"), list each other topic ("alter") that shares at
# least one top word with it, the summed phi of the shared words in the alter
# topic, and the shared words themselves.
adjMat <- bind_rows(lapply(1:100, function(k) {
  egoWords <- topWords %>% filter(topic == k)
  topWords %>%
    filter(topic != k,
           word %in% egoWords$word) %>%
    group_by(topic) %>%
    summarise(phi = sum(phi),
              terms = paste(word,collapse = ','),.groups = 'drop') %>%
    rename(alter = topic) %>%
    mutate(ego = k)
}))


# Weighted adjacency matrix (rows = alter, columns = ego); pairs without
# shared words get weight 0.
g <- graph_from_adjacency_matrix(as.matrix(data.frame(adjMat %>%
                                                        select(-terms) %>%
                                                        spread(ego,phi) %>%
                                                        select(-alter) %>%
                                                        mutate_all(function(x) ifelse(is.na(x),0,x)))),weighted = TRUE)

clusts <- igraph::walktrap.community(g)

# Mean theta and community membership per topic.
# NOTE(review): membership is attached by position, which assumes the rows
# here are in the same order as the graph vertices — confirm.
allNodes <- toAnal %>%
  mutate(topic = as.numeric(topic)) %>%
  group_by(topic) %>%
  summarise(value = mean(theta),.groups = 'drop') %>%
  mutate(comm = clusts$membership)

# Node label: the three highest-phi words per topic, skipping words that
# contain an underscore and words of one or two characters.
top3 <- topWords %>%
  filter(!grepl('_',word),
         nchar(word) > 2) %>%
  group_by(topic) %>%
  slice_max(phi,n = 3) %>%
  summarise(label = paste(word,collapse = ','))

top3 <- top3 %>%
  left_join(toAnal %>%
              mutate(topic = as.numeric(topic)) %>%
              group_by(topic) %>%
              summarise(n = n(),
                        thetaWgtIdeo = sum(theta*nominate_dim1),.groups = 'drop'),
            by = 'topic')

# Keywords that select the topics to draw. They are joined programmatically:
# when the pattern was one string literal broken across several source lines,
# every line break and its indentation became part of the regex, so the first
# keyword on each continuation line (price, econom, product, financ) could
# effectively never match a label. Restoring them may add nodes relative to
# earlier renderings of this figure.
nodeKeywords <- paste(c('job','employ','wage','gas','tax','spend','inflation','chair','thank',
                        'price','debt','cost','profit','audit','risk','education',
                        'econom','deficit','budget','capital','federal','regulat','percent',
                        'product','income','sav','policy','consumer','rule','requir','interest',
                        'financ','bank','trade','currenc','fund','house','market','secur'),
                      collapse = '|')

# An earlier, unfiltered nodes/edges pair was built and then immediately
# overwritten; only the versions actually passed to visNetwork() are kept.
nodes <- data.frame(allNodes %>%
                      left_join(top3, by = 'topic') %>%
                      rename(id = topic)) %>%
  filter(grepl(nodeKeywords,label))

# Keep links between retained nodes whose shared-word weight exceeds .06.
edges <- data.frame(adjMat %>% filter(ego %in% nodes$id,alter %in% nodes$id,phi > .06) %>% select(from = ego,to = alter,value = phi))
# NOTE(review): only five colours are requested, so a community numbered
# above 5 would receive NA — confirm the clustering has at most five groups.
nodes$color <- RColorBrewer::brewer.pal(5,'RdYlBu')[nodes$comm]

# Need to manually save SI figure 16 from the Viewer tab in RStudio
visNetwork(nodes, edges) %>%
          visLayout(randomSeed = 123) %>%
          visPhysics(stabilization = FALSE)



# SI Figure 17
# carriageReturner: insert a line break after every `n` space-separated words.
#
# Arguments:
#   x  character vector of texts.
#   n  number of words per output line (default 10).
#
# Returns one string per element of `x`, with lines joined by "\n". Because
# the function is wrapped in Vectorize(), it can be called on a whole column.
#
# The previous version indexed past the end of the word vector, which padded
# the last line with NA, and then removed the padding with gsub(' NA', '').
# That also deleted " NA" from real words (e.g. " NAFTA" became "FTA"). Each
# chunk is now cut at the last word instead, so no padding is ever created.
carriageReturner <- Vectorize(function(x,n = 10) {
  # Appending one separator makes base strsplit() keep a trailing empty token
  # and return "" for an empty string, matching stringr::str_split(x, ' ').
  words <- strsplit(paste0(x, ' '), ' ', fixed = TRUE)[[1]]
  starts <- seq(1, length(words), by = n)
  lines <- vapply(starts, function(i) {
    paste(words[i:min(i + n - 1, length(words))], collapse = ' ')
  }, character(1))
  paste(lines, collapse = '\n')
})


# For every topic, pick the two utterances with the highest theta among those
# that contain no question mark or run of dashes and are between 221 and 279
# characters long.
toplot <- toAnal %>%
  mutate(topic = as.numeric(topic)) %>%
  left_join(allNodes) %>%
  left_join(top3) %>%
  filter(!grepl('--+|\\?', textclean),
         nchar(textclean) > 220 & nchar(textclean) < 280) %>%
  group_by(topic) %>%
  slice_max(theta, n = 2, with_ties = FALSE)

# One row per topic: the example utterances as a bulleted, line-wrapped block
# (17 words per line) with a leading speaker tag removed.
toplot2 <- toplot %>%
  group_by(label, value, topic) %>%
  summarise(textclean = paste(
    paste0(' - ',
           carriageReturner(gsub('^Mr\\. Miller of California\\. ', '', textclean),
                            n = 17)),
    collapse = '\n')) %>%
  ungroup()

# The outer parentheses display the plot as well as storing it in p2.
(p2 <- toplot2 %>%
    filter(topic %in% c(1, 20, 38, 58, 63, 65, 89, 90, 100)) %>%
    mutate(label = paste0('Topic ', topic, ':\n', label)) %>%
    ggplot(aes(x = 0, y = reorder(label, value), label = textclean)) +
    geom_label(size = 3.2, hjust = 0) +
    xlim(c(0, 2)) +
    xlab('') + ylab('') +
    theme_ridges() +
    theme(axis.text.x = element_blank(),
          axis.text.y = element_text(size = 9, vjust = .5),
          plot.margin = unit(c(0.15, 0, .2, 0), "cm")))

pdf(file = './output/figures/SI_figure_17.pdf',
    width = 9, height = 11)
p2
dev.off()

# SI Figure 18
# Looking at STM
# NOTE(review): the plot() calls on `prep` below presumably rely on a plot
# method supplied by the package that created it; that package is not
# attached by this script — confirm it is available when this section runs.
load('./data/prepped/hearings/stm_results.RData')

# Long table with one row per document (row of out$meta) and topic, holding
# that document's topic proportion. An earlier assignment that kept only each
# document's highest-theta topic was overwritten immediately by this one and
# has been removed.
toAnal <- out$meta %>%
  as_tibble() %>%
  mutate(fullInd = row_number()) %>%
  select(-textclean) %>%
  left_join(
    fit$theta %>%
      data.frame() %>%
      mutate(fullInd = row_number()) %>%
      as_tibble() %>%
      gather(topic,theta,-fullInd) %>%
      mutate(topic = gsub('X','topic_',topic)))


# Raw difference in mean theta between interrupted and uninterrupted
# utterances, excluding rows whose opensecretsID contains 'YELLEN'.
toplot0 <- toAnal %>%
  filter(!grepl('YELLEN',opensecretsID)) %>%
  group_by(topic,interrupted) %>%
  summarise(theta = mean(theta)) %>%
  ungroup() %>%
  spread(interrupted,theta,sep = '_') %>%
  mutate(diff = interrupted_1 - interrupted_0)

# Estimates for the `interrupted` covariate, and Yellen-minus-other-chair
# differences for the `comparisons` covariate, for all 100 topics.
toplot1 <- plot(prep,covariate = 'interrupted',
                topics = 1:100,
                method = 'pointestimate')

toplot2 <- plot(prep,covariate = 'comparisons',
                topics = 1:100,
                method = 'difference',cov.value1 = 'FEDYELLEN',cov.value2 = 'FEDBERNANKE')

toplot3 <- plot(prep,covariate = 'comparisons',
                topics = 1:100,
                method = 'difference',cov.value1 = 'FEDYELLEN',cov.value2 = 'FEDPOWELL')

toplot4 <- plot(prep,covariate = 'comparisons',
                topics = 1:100,
                method = 'difference',cov.value1 = 'FEDYELLEN',cov.value2 = 'FEDGREENSPAN')


# The interruption estimates are the same for every reference chair, so they
# are built once rather than inside the loop.
intEst <- as_tibble(data.frame(mInt = unlist(toplot1$means),
                               lbInt = sapply(toplot1$cis,function(x) x[1]),
                               ubInt = sapply(toplot1$cis,function(x) x[2])) %>%
                      mutate(topic = paste0('topic_',row_number())))

# Difference estimates keyed by the reference chair they compare Yellen to
# (replaces the lookup of toplot2..toplot4 by constructed variable name).
diffPlots <- list(Bernanke = toplot2,Powell = toplot3,Greenspan = toplot4)

toplotFull <- bind_rows(lapply(names(diffPlots),function(refChair) {
  est <- diffPlots[[refChair]]
  intEst %>%
    left_join(as_tibble(data.frame(m = unlist(est$means),
                                   lb = sapply(est$cis,function(x) x[1]),
                                   ub = sapply(est$cis,function(x) x[2])) %>%
                          mutate(topic = paste0('topic_',row_number()),
                                 ref = refChair)))
}))


pdf('./output/figures/SI_figure_18.pdf',width = 9,height = 4)
toplotFull %>%
  # Point size: mean theta of each topic outside rows matching 'YELLEN'.
  left_join(toAnal %>%
              filter(!grepl('YELLEN',opensecretsID)) %>%
              group_by(topic) %>%
              summarise(theta = mean(theta)) %>%
              ungroup()) %>%
  # A topic counts as significant only if both intervals exclude zero.
  mutate(sig = ifelse((lbInt > 0 | ubInt < 0) & (lb > 0 | ub < 0),1,0)) %>%
  drop_na(ref) %>%
  ggplot(aes(x = m,y = mInt,size = theta,alpha = factor(sig))) +
  geom_point() +
  geom_vline(xintercept = 0,linetype = 'dashed') +
  geom_hline(yintercept = 0,linetype = 'dashed') +
  geom_errorbarh(aes(xmin = lb,xmax = ub),size = .5) +
  geom_errorbar(aes(ymin = lbInt,ymax = ubInt),size = .5) +
  geom_point(shape = 21,fill = 'white') +
  scale_alpha_manual(name = 'Significant',values = c(.1,1),labels = c('Insig','Sig')) +
  scale_size_continuous(name = 'Prevalence') +
  facet_grid(~ref) +
  ylab(bquote('' %<-% ' Topic Interrupted Less ... Topic Interrupted More ' %->% '')) +
  xlab(bquote('' %<-% ' Topic Used More by Fed Chairs ... Topic Used More by Yellen ' %->% '')) +
  theme_ridges() +
  theme(axis.title.x = element_text(hjust = .5,vjust = 0),
        axis.title.y = element_text(hjust = .5,vjust = 0),
        legend.position = 'bottom')
dev.off()


# SI Figure 19
# Keyword dictionaries: one regular expression per topic. An utterance is
# assigned to a topic when its cleaned text matches the lower-cased pattern
# anywhere (plain substring matching, no word boundaries).
dict <- list(TOPIC_price_stability = 'price(s)*|inflate|inflation|inflationary|HICP|CPI|PCE|PCE index|PCE inflation|deflation|deflator|deflationary|deflate|hyperinflation|hyperinflationary',
             TOPIC_financial_stability = 'financial (in)*stability|bank (in)*stability|(financial )*crisis|financial stress|financial risk|systemic risk|contagion|financial shocks|bubble|financial imbalance|misalignment|credit growth|banks|insurers|hedge funds|investment funds|financial markets|securities markets|leverage|capital|derivatives|off-balance sheet exposures|special purpose vehicles|off-balance sheet vehicles|payment systems|settlement systems|central securities depositories|non-performing loans|npls|non-performing exposures|foreign currency loans|correlated exposures',
             TOPIC_employment = 'employ(ee|er)*|(un)*employment|underemployment|firing|fixed-term|full-time|part-time|inactivity|job(s)*|jobless|labo(u)*r|labo(u)*r force|labo(u)*r market|self-employed|temporary|vacanc|work(er)*|workers|working|working( age| time)*|works',
             TOPIC_international_developments = 'Trade|International|Global|Cross-border|Emerging markets|Emerging economies|Outside the euro area|Outside the EU|Geopolitic|China|Chinese|Lehman|United States|The US|USA|America|Canada|Canadian|Japan|Japanese|Russia|Russian|India|Indian|Turkey|Turkish|Argentina|Argentinian|Brexit|United Kingdom|England|Norway|Norwegian|Enlargement|Developing economies|Developing countries|World Bank|IMF|War|Middle East|Far East|OPEC|WTO|Exchange rate|Sweden|Swedish|Oil|Gas|Commodity|G7|G20|Korea|Korean|Northern Rock|Terrorism|Terrorist|Africa|African|Asia|Australia|Oversea|External representation|IRE|Dollar|Pound|Ruble|Yuan|Yen|Renminbi|LTCM|External demand|Exports|Imports|Advanced economies|Value chain|US Treasuries|Fed|Federal Reserve|Bank of England|Scotland|Scottish|PBOC|Basel|Bank of International Settlements|BIS|Washington|New York',
             TOPIC_payments_issues = 'Payment|Payment systems|CCP|Clearing|Market infrastructures|Digital euro|Wholesale transactions|Bitcoin|Stablecoins|Libra|Diem|Instant payments|CBDC|Cash|Banknotes|Coins|Card|E-money|Private money|Central bank money|Digital dollar|TARGET|TARGET2|T2S|real-time gross settlement|SEPA|TIPS|Payment|Settlement|DLT|Ledger|Blockchain|Token|Digital currency|Cryptocurrencies|Crypto-currencies|Crypto-assets|Cryptoassets|Big tech firms|Big techs',
             TOPIC_EMU_governance = 'Fiscal policy|Fiscal rules|Fiscal Board|Bailout|Bail-in|Single Supervisory Mechanism SSM|Single Resolution Mechanism|SRM|Banking supervision|Microprudential|Macroprudential|Prudential policies|Macroeconomic policies|Five Presidents|Four Presidents|Economic and Monetary Union|EMU|EU budget|Multiannual financial framework|MFF|SURE|Stability and Growth Pact|SGP|Stability and growth|Banking Union|Deposit insurance|EDIS|NGEU|Next Generation|Recovery and Resilience|Fiscal capacity|BICC|Risk-sharing|Transfer union|Policy mix|International role of the euro|IRE|Moral hazard|Financial assistance|Troika|European Stability Mechanism|ESM|Corrective arm|Budget|Capital Markets Union|CMU|Integration|Deepening|Country Specific Recommendations|CSRs|Euro adoption|Changeover|Convergence|Divergence',
             TOPIC_environment = 'green|climate|climate change|green bond|sustainable finan|wildfire|hurricane|natural disaster|emission|co2|carbon dioxide|fossil fuel|pollut|greenwashing|carbon|brown|harm|fossil|fuel|environmental|environment|transition|ecology|ecological|taxonomy|greening|pollution|polluting|biodiversity|emissions|weather')

dict <- lapply(dict, tolower)

# Add one logical column per topic to utterance_level.
for (topicName in names(dict)) {
  utterance_level[[topicName]] <- grepl(dict[[topicName]], utterance_level$textclean)
}

# Share of utterances flagged for each topic by chamber, date and period; the
# EMU governance topic is left out of the figure.
fig19Data <- utterance_level %>%
  group_by(chamber, date, yellenTime) %>%
  summarise(across(matches('TOPIC_[a-z]'), mean), .groups = 'drop') %>%
  gather(topic, prop, -chamber, -date, -yellenTime) %>%
  mutate(topic = gsub('Emu', 'EMU', str_to_title(gsub('_', ' ', gsub('TOPIC_', '', topic))))) %>%
  mutate(yellenTime = ifelse(date < as.Date('2014-01-01'), 'Pre',
                             ifelse(yellenTime, 'Yellen', 'Post'))) %>%
  filter(!grepl('EMU', topic))

pdf(file = './output/figures/SI_figure_19.pdf', width = 8, height = 5)
ggplot(fig19Data, aes(x = date, y = prop, group = yellenTime)) +
  geom_point() +
  annotate(geom = 'rect', xmin = as.Date('2014-01-01'), xmax = as.Date('2018-01-01'),
           ymin = -Inf, ymax = Inf,
           alpha = .2, fill = 'grey50') +
  geom_vline(xintercept = as.Date(c('2014-01-01', '2018-01-01'))) +
  annotate(geom = 'text', x = as.Date('2016-01-01'), y = Inf, label = 'Yellen',
           vjust = 1) +
  geom_smooth(method = 'lm', se = FALSE) +
  labs(x = 'Date', y = 'Proportion of utterances',
       title = 'Financial Topic Analysis',
       subtitle = 'Keyword-based topic identification') +
  facet_wrap(~topic, scales = 'free')
dev.off()


# SI Table 14
# Long table with one row per utterance and keyword topic; `prop` is TRUE when
# the utterance matched that topic's keywords.
toAnal <- utterance_level %>%
  mutate(spkr = factor(case_when(grepl('YELLEN', opensecretsID) ~ 'Yellen',
                                 grepl('FED', opensecretsID) ~ 'FED',
                                 TRUE ~ 'Others'),
                       levels = c('Others', 'FED', 'Yellen'))) %>%
  select(date, ind, spkr, interrupted, matches('TOPIC_[a-z]')) %>%
  gather(topic, prop, -date, -ind, -spkr, -interrupted)

# Estimation sample shared by the three models: matched utterance-topic pairs
# only, without the EMU topic; employment and "Others" are the reference
# categories.
tab14Data <- toAnal %>%
  filter(prop,
         !grepl('EMU', topic)) %>%
  mutate(topic = relevel(factor(gsub('TOPIC', '', topic)), ref = '_employment'),
         spkr = relevel(factor(paste0('_', spkr)), ref = '_Others'))

# Interruption on topic with date fixed effects; then adding speaker type;
# then interacting topic with speaker type.
summary(m <- feols(interrupted ~ topic | date, tab14Data))

summary(m2 <- feols(interrupted ~ topic + spkr | date, tab14Data))

summary(m3 <- feols(interrupted ~ topic*spkr | date, tab14Data))

etable(m, m2, m3,
       signif.code = c('***' = .001, '**' = .01, '*' = .05, '\\dag' = .1),
       file = './output/tables/SI_table_14.tex')


# SI Figure 20
# Lagged versions of the 70-topic columns.
# NOTE(review): lag() runs over the whole table sorted by docID and fullInd,
# without grouping by docID, so the first utterance of a hearing takes its lag
# from the last utterance of the preceding one — confirm this matches how the
# other *_lag columns were built upstream.
utterance_level <- utterance_level %>%
  arrange(docID,fullInd) %>%
  mutate_at(vars(matches('topic70.*_\\d')),list(lag = ~lag(.)))

# Analysis table: missing lags become 0, infinite relative vote shares become
# 1, and Bernanke is the reference category for respondingTo.
dyadToAnal <- utterance_level %>%
  mutate_at(vars(matches('lag')),function(x) ifelse(is.na(x),0,x)) %>%
  mutate(votepct_rel = ifelse(is.infinite(votepct_rel),1,votepct_rel),
         respondingTo = relevel(factor(respondingTo),ref = 'FEDBERNANKE'))

# Tone score columns (no lag/error columns, no SEVERE/AUTHOR/LIKELY scores).
dims <- colnames(utterance_level %>% select(matches('SENT_'),-matches('_lag|error')) %>% select(-matches('SEVERE|AUTHOR|LIKELY')))

# Individual tone dimensions; the combined ('comb') scores are added by name.
# Logical negation is used instead of dims[-which(...)], because the latter
# returns an empty vector when nothing matches.
toneDims <- dims[!grepl('comb',dims)]

# Right-hand-side terms shared by the models below: lagged tone scores,
# utterance-length and hearing-size polynomials, and the fixed effects.
dyadRhsTail <- paste0(' + ',
                      paste(paste0('scale(',toneDims,'_lag)'),collapse = ' + '),
                      ' + scale(SENT_combAttack_lag) + scale(SENT_combIncoh_lag) + scale(SENT_combToxic_lag)',
                      ' + poly(scale(log(nchars_lag+1)),3) + poly(scale(log(tot_utterances)),3) + interrupted',
                      '| opensecretsID + docID')

# Common estimation sample.
dyadSample <- dyadToAnal %>%
  filter(all > 30,ind > mind,
         yellen_vote != 1)

# Controls: the 100 lagged utterance-level topics.
summary(modDyadOG <- feols(as.formula(paste0('interruptor ~ respondingTo + ',
                                             paste(paste0('topic_',1:100,'_lag'),collapse = ' + '),
                                             dyadRhsTail)),
                           dyadSample,
                           cluster = 'opensecretsID + respondingTo'))

# Controls: the 70 lagged topic70Grped_ columns.
summary(modDyadGrp70 <- feols(as.formula(paste0('interruptor ~ respondingTo + ',
                                                paste(paste0('topic70Grped_',1:70,'_lag'),collapse = ' + '),
                                                dyadRhsTail)),
                              dyadSample,
                              cluster = 'opensecretsID + respondingTo'))

# Controls: the 70 lagged topic70Spkr_ columns.
summary(modDyadSpkr70 <- feols(as.formula(paste0('interruptor ~ respondingTo + ',
                                                 paste(paste0('topic70Spkr_',1:70,'_lag'),collapse = ' + '),
                                                 dyadRhsTail)),
                               dyadSample,
                               cluster = 'opensecretsID + respondingTo'))


# Substantive topics: JOPRR1
# Print the ten top words of every topic (one row per topic after t()) so the
# hand-coded list below can be checked against them.
load('./data/prepped/hearings/topic_models_100.RData')
lda_model$get_top_words(n = 10) %>%
  data.frame() %>%
  rename_all(function(x) gsub('X','topic_',x)) %>%
  mutate(top_word = row_number()) %>%
  as_tibble() %>%
  t()


# Topics excluded from the "substantive" set.
insubs <- c(1, 5, 9, 11, 12, 19, 20, 22, 24, 27,
            29, 31, 37, 39, 41, 42, 46, 47, 49, 51,
            54, 57, 69, 71, 76, 83, 85, 88, 90, 91,
            92, 93, 96, 97)

# Column stems of all remaining topics.
substantive <- paste0('topic_',setdiff(1:100,insubs))

# Same specification as the other dyad models, with only the substantive
# topics' lags as topic controls. The tone dimensions are selected with
# logical negation: dims[-which(...)] returns an empty vector when nothing
# matches 'comb'.
summary(modDyadSubs <- feols(as.formula(paste0('interruptor ~ respondingTo + ',
                                               paste(paste0(substantive,'_lag'),collapse = ' + '),
                                               ' + ',
                                               paste(paste0('scale(',dims[!grepl('comb',dims)],'_lag)'),collapse = ' + '),
                                               ' + scale(SENT_combAttack_lag) + scale(SENT_combIncoh_lag) + scale(SENT_combToxic_lag)',
                                               ' + poly(scale(log(nchars_lag+1)),3) + poly(scale(log(tot_utterances)),3) + interrupted',
                                               '| opensecretsID + docID')),
                             dyadToAnal %>%
                               filter(all > 30,ind > mind,
                                      yellen_vote != 1),
                             cluster = 'opensecretsID + respondingTo'))




# Prognostic topics
# Random forest predicting interruption from the 100 topic columns, with
# permutation variable importance.
forVimp <- dyadToAnal %>%
  select(interrupted, matches('topic_\\d+$')) %>%
  mutate(interrupted = factor(interrupted))

rangMod <- ranger(interrupted ~ ., data = forVimp, importance = 'permutation')

# Top 50 words per topic, one column per topic.
topWord <- lda_model$get_top_words(n = 50) %>%
  data.frame() %>%
  rename_all(function(x) gsub('X', 'topic_', x)) %>%
  mutate(top_word = row_number()) %>%
  as_tibble()

# Label for each topic: its first five top words.
topTerms <- topWord %>%
  gather(topic, term) %>%
  group_by(topic) %>%
  slice(1:5) %>%
  summarise(terms = paste(term, collapse = ', '))

# Looking at which topics are interrupted the most
# Importance is expressed relative to the forest's prediction error.
toplot <- data.frame(topic = names(rangMod$variable.importance),
                     vimp = rangMod$variable.importance / rangMod$prediction.error) %>%
  left_join(topTerms) %>%
  as_tibble()

pdf(file = './output/figures/SI_figure_20.pdf', width = 7, height = 9)
ggplot(toplot, aes(x = vimp, y = reorder(terms, vimp))) +
  geom_col() +
  labs(x = 'Variable Importance (% reduction in error)',
       y = 'Topic',
       title = 'Variable Importance',
       subtitle = 'Topics most effective at predicting interruption')
dev.off()


# SI Table 15
# The 20 topics with the highest variable importance serve as topic controls.
ctrls <- toplot %>%
  arrange(desc(vimp)) %>%
  slice(1:20) %>%
  pull(topic)


# Same specification as the other dyad models, with only the selected topics'
# lags as topic controls. The tone dimensions are selected with logical
# negation: dims[-which(...)] returns an empty vector when nothing matches
# 'comb'.
summary(modDyadVIMP <- feols(as.formula(paste0('interruptor ~ respondingTo + ',
                                               paste(paste0(ctrls,'_lag'),collapse = ' + '),
                                               ' + ',
                                               paste(paste0('scale(',dims[!grepl('comb',dims)],'_lag)'),collapse = ' + '),
                                               ' + scale(SENT_combAttack_lag) + scale(SENT_combIncoh_lag) + scale(SENT_combToxic_lag)',
                                               ' + poly(scale(log(nchars_lag+1)),3) + poly(scale(log(tot_utterances)),3) + interrupted',
                                               '| opensecretsID + docID')),
                             dyadToAnal %>%
                               filter(all > 30,ind > mind,
                                      yellen_vote != 1),
                             cluster = 'opensecretsID + respondingTo'))

# Display names for coefficients and fixed effects in the exported table.
dict <- c('respondingToFEDYELLEN' = 'Yellen (ref. Bernanke)','respondingToFEDPOWELL' = 'Powell (ref. Bernanke)',
          'respondingToFEDGREENSPAN' = 'Greenspan (ref. Bernanke)','scale(age)' = 'Age (scaled)',
          'scale(votepct)' = 'Vote Share (scaled)','scale(nominate_dim1)' = 'Ideology (scaled)',
          'GOP' = 'Republican (ref. Democrat)','chamberSenate' = 'Senate (ref. House)',
          'genderM' = 'Male (ref. Female)','scale(seniority)' = 'Seniority (scaled)','scale(constrain_empower_tot)' = 'Fed Oversight Sponsor',
          'yellen_vote' = 'Oppose Yellen Conf.','interrupted' = 'Interrupted','docID' = 'Hearing','opensecretsID' = 'Speaker')


# The topic counts in the last two column headers are taken from the vectors
# that define those models. The substantive header used to read '(40 topics)'
# although 100 minus the 34 excluded topics leaves 66 in the model.
etable(modDyadOG,modDyadGrp70,modDyadSpkr70,modDyadSubs,modDyadVIMP,
       keep = 'ref. |scaled|Oversight|Oppose|Interrupted',
       order = c('^Yellen','Powell','Greenspan','Age','Vote','Ideology','Republican','Senate','Male','Seniority','Oversight','Oppose','Interr'),
       dict = dict,extralines = list('LDA Topics (see columns)' = c('Yes','Yes','Yes','Yes','Yes'),
                                     'Tone Probabilities' = c('Yes','Yes','Yes','Yes','Yes')),
       depvar = FALSE,digits = 3,digits.stats = 3,signif.code = c('***' = .001,'**' = .01,'*' = .05,'\\dag' = .1),replace = TRUE,
       headers = list(c('Per Utterance','Per chunk','Per speaker','Substantive','Prognostic'),
                      c('(100 Topics)','(70 Topics)','(70 Topics)',
                        paste0('(',length(substantive),' topics)'),
                        paste0('(',length(ctrls),' topics)'))),
       file = './output/tables/SI_table_15.tex')


# SI Figure 21
# Keyword patterns searched for in lower-cased utterance text. Each raw regex
# is stored in `grp` and later used to tell the three language types apart.
keywordPatterns <- c("(m not(sure|certain|positive|aware)|(do not|don't) know|m unsure)",
                     "back to you|follow up with you|look into|(need|have) to (check|confer|look)",
                     "(i am sorry|i'm sorry|i apologize|forgive me)")

# For one pattern: among speakers whose ID contains 'FED', count matching
# utterances by interruption status and express them as a share of that
# speaker's total number of utterances (nTot).
countKeywordUtterances <- function(pattern) {
  utterance_level %>%
    select(docID,chamber,opensecretsID,text,interrupted) %>%
    filter(grepl('FED',opensecretsID)) %>%
    add_count(opensecretsID,name = 'nTot') %>%
    filter(grepl(pattern,tolower(text))) %>%
    count(opensecretsID,interrupted,nTot) %>%
    mutate(pct = n / nTot,
           opensecretsID = factor(str_to_title(gsub('FED','',opensecretsID)),
                                  levels = c('Greenspan','Bernanke','Yellen','Powell')),
           grp = pattern)
}

# Patterns are walked in reverse so the stacked rows come out last-pattern-first
toplot <- bind_rows(lapply(rev(keywordPatterns),countKeywordUtterances))

# `lab`: within each chair x keyword pattern, each row's share of the summed
# pct (i.e. what fraction of the keyword utterances fall in that interruption
# status).
toplot <- toplot %>%
  group_by(opensecretsID,grp) %>%
  mutate(lab = pct / sum(pct)) %>%
  ungroup()

# Swap the raw regex stored in `grp` for a readable facet title
toplot <- toplot %>%
  mutate(grp = case_when(grepl('sorry',grp) ~ 'Apologetic Language',
                         grepl('unsure',grp) ~ 'Uncertain Language',
                         TRUE ~ 'Delaying Language'))

# Stacked bars of keyword-utterance shares per Fed chair, one facet per
# language type; the text label is the `lab` share of the interrupted rows.
pdf('./output/figures/SI_figure_21.pdf',width = 8,height = 4)
# Explicit print(): ggplot objects are only drawn by auto-printing at top
# level, which does not happen under source(), leaving an empty PDF.
print(
  toplot %>%
    ggplot(aes(x = opensecretsID,y = pct,fill = factor(interrupted))) +
    geom_bar(stat = 'identity',color = 'grey70',linewidth = .2) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_manual(values = c('white','grey30'),
                      labels = c('Not interrupted','Interrupted')) +
    labs(x = 'Fed Chair',
         y = 'Proportion of total utterances',
         fill = 'Utterance',
         title = 'Uncertainty and Obsequiousness',
         subtitle = 'Proportions of utterances with keywords, of which interrupted in gray') +
    theme_bw() +
    geom_text(data = toplot %>%
                filter(interrupted == 1),
              aes(label = paste0(round(lab*100,1),'%')),
              vjust = -.2) +
    facet_wrap(~grp) +
    theme(legend.position = 'bottom',
          axis.text.x = element_text(size = 8))
)
dev.off()


# SI Figure 22
# NOTE(review): `res` is presumably supplied by this .RData file -- confirm.
load('./output/chatGPT_polite_BTM_aggression_cleaned.RData')

# Reduce the raw `cleaned` answer to its first single digit (with any leading
# hyphens), then fold 0 into -1, the value filtered out before analysis.
res <- res %>%
  mutate(cleaned = as.numeric(str_extract(cleaned,'(-)*\\d')),
         cleaned = ifelse(cleaned == 0,-1,cleaned))

# Pairwise tallies for the heatmap: among valid, non-reversed 'more'
# comparisons, count how often each chair is the more / less aggressive side,
# split by a length bucket built from n_1.
tab <- res %>%
  filter(cleaned != -1,
         direction == 'more',
         !reversed) %>%
  # TRUE when the first conversation of the pair is the more aggressive one
  mutate(firstIsMore = (cleaned == 1 & direction == 'more') |
                         (cleaned == 2 & direction == 'less'),
         moreAgg = ifelse(firstIsMore,fed_1,fed_2),
         lessAgg = ifelse(firstIsMore,fed_2,fed_1),
         # Explicit final condition (rather than TRUE) keeps a missing n_1 as NA
         length = case_when(n_1 < 5 ~ '3-4',
                            n_1 < 7 ~ '5-6',
                            n_1 < 9 ~ '7-8',
                            n_1 >= 9 ~ '9+')) %>%
  count(moreAgg,lessAgg,length)

# Heatmap of pairwise aggression comparisons: each tile is the share of
# comparisons within an unordered chair pair x length bucket in which the
# y-axis chair was the more aggressive one.
pdf('./output/figures/SI_figure_22.pdf',width = 8,height = 4)
# Explicit print(): ggplot objects are only drawn by auto-printing at top
# level, which does not happen under source(), leaving an empty PDF.
print(
  tab %>%
    # Order-free pair key: alphabetically smaller name first. This yields the
    # same six cross-chair keys as an explicit lookup, but a same-chair or
    # unexpected pair gets its own key instead of falling into 'Powell_Yellen'
    # and inflating that pair's denominator.
    mutate(dyadID = paste(pmin(moreAgg,lessAgg),pmax(moreAgg,lessAgg),sep = '_')) %>%
    group_by(dyadID,length) %>%
    mutate(share = n / sum(n)) %>%
    ungroup() %>%
    # Keep only the plotting columns; the data are ungrouped here, so no
    # grouping column is silently re-added.
    select(moreAgg,lessAgg,length,share) %>%
    mutate(lessAgg = factor(lessAgg,levels = rev(c('Bernanke','Greenspan','Powell','Yellen'))),
           moreAgg = factor(moreAgg,levels = c('Bernanke','Greenspan','Powell','Yellen'))) %>%
    ggplot(aes(x = lessAgg,y = moreAgg,fill = share)) +
    geom_tile() +
    geom_text(aes(label = paste0(round(share*100,0),'%')),size = 3.5) +
    scale_fill_gradient2(midpoint = .5,low = 'darkred',mid = 'white',high = 'darkgreen') +
    theme_bw() +
    labs(x = 'Less aggressive conversations',
         y = 'More aggressive conversations',
         fill = 'Proportion of\ncomparisons',
         title = 'AI-annotated aggression',
         subtitle = 'Proportion of comparisons (y-axis to x-axis)\nthat are more aggressive by conversation length (facets)') +
    facet_grid(~length) +
    theme(legend.position = 'none',
          axis.text.x = element_text(angle = 45,hjust = 1))
)
dev.off()


# SI Figure 23
# Assign a year to a Fed chair period using the cut-offs 2006 / 2014 / 2018
# (a missing year stays NA).
chairPeriod <- function(yr) {
  ifelse(yr < 2006,'Greenspan',
         ifelse(yr < 2014,'Bernanke',
                ifelse(yr < 2018,'Yellen','Powell')))
}

# One row per distinct comparison profile, with the number of times each
# choice was made spread into choice_1 / choice_2 columns.
aggressive.sf <- res %>%
  as_tibble() %>%
  filter(cleaned != -1) %>%
  mutate(choice = cleaned,
         year_1 = lubridate::year(date_1),
         year_2 = lubridate::year(date_2),
         period_1 = chairPeriod(year_1),
         period_2 = chairPeriod(year_2)) %>%
  count(fed_1,fed_2,choice,direction,reversed,chamber_1,chamber_2,
        n_1,n_2,nchars_1,nchars_2,year_1,year_2,period_1,period_2) %>%
  # pivot_wider() replaces the superseded spread() + mutate_at() pair:
  # profiles where a choice never occurred are filled with 0 directly.
  pivot_wider(names_from = choice,
              values_from = n,
              names_prefix = 'choice_',
              values_fill = 0L)

# Restrict to reversed 'more' comparisons and fix the chair factor levels
# (Yellen first).
chairLevels <- c('Yellen','Greenspan','Bernanke','Powell')

aggressive.sf2 <- aggressive.sf %>%
  filter(direction == 'more',reversed) %>%
  mutate(fed_1 = factor(fed_1,levels = chairLevels),
         fed_2 = factor(fed_2,levels = chairLevels))

# Bundle one side's columns (suffix _1 or _2) into a data frame with
# unsuffixed names, so both sides expose fed / chamber / year / nchars / period.
contestantFrame <- function(dat,side) {
  pick <- function(stem) dat[[paste0(stem,'_',side)]]
  data.frame(fed = pick('fed'),
             chamber = pick('chamber'),
             year = pick('year'),
             nchars = pick('nchars'),
             period = pick('period'))
}

# Replace each fed_* factor column with its side's data frame
aggressive.sf2$fed_1 <- contestantFrame(aggressive.sf2,1)
aggressive.sf2$fed_2 <- contestantFrame(aggressive.sf2,2)


# Bradley-Terry model of which conversation was chosen, with chair, chamber,
# year and log character count as player-specific predictors.
aggressiveModel <- BTm(cbind(choice_1,choice_2),fed_1,fed_2,
                       ~ fed + chamber + factor(year) + log(nchars),
                       id = 'fed',data = aggressive.sf2)
# Explicit print() so the summaries reach the log even when the script is
# run through source(), where top-level values are not auto-printed.
print(summary(aggressiveModel))

# Same model refit with br = TRUE (spelled out: T is a reassignable variable)
print(summary(update(aggressiveModel,br = TRUE)))

# Quasi-variances for the estimated abilities, used for the interval plot
qv <- qvcalc(BTabilities(aggressiveModel))


# Dot-and-whisker plot of the estimated abilities: thin bars span
# +/- 1.96 quasi-SEs, thick bars +/- 1.65 quasi-SEs.
pdf('./output/figures/SI_figure_23.pdf',width = 7,height = 5)
# Explicit print(): ggplot objects are only drawn by auto-printing at top
# level, which does not happen under source(), leaving an empty PDF.
print(
  qv$qvframe %>%
    mutate(chair = row.names(.)) %>%
    as_tibble() %>%
    ggplot(aes(x = estimate,y = reorder(chair,estimate))) +
    geom_point(size = 3) +
    geom_errorbarh(aes(xmin = estimate - 1.96*quasiSE,
                       xmax = estimate + 1.96*quasiSE),
                   height = 0) +
    # linewidth (not size) sets line thickness, matching the geom_bar() call
    # used for SI Figure 21 and avoiding the ggplot2 deprecation warning.
    geom_errorbarh(aes(xmin = estimate - 1.65*quasiSE,
                       xmax = estimate + 1.65*quasiSE),
                   height = 0,linewidth = 1.2) +
    geom_vline(xintercept = 0,linetype = 'dashed') +
    theme_bw() +
    labs(x = 'More aggressive conversations',y = 'Fed Chair',
         title = 'Bradley-Terry measure of aggression',
         subtitle = 'Estimated aggression relative to Yellen')
)
dev.off()

# EOF